{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "20a93957",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8944a8dab0c74b17912752ba1ee006bd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/26.8k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Available Configurations:\n",
      "['ar_en', 'ca_en', 'cy_en', 'de_en', 'en_ar', 'en_ca', 'en_cy', 'en_de', 'en_et', 'en_fa', 'en_id', 'en_ja', 'en_lv', 'en_mn', 'en_sl', 'en_sv-SE', 'en_ta', 'en_tr', 'en_zh-CN', 'es_en', 'et_en', 'fa_en', 'fr_en', 'id_en', 'it_en', 'ja_en', 'lv_en', 'mn_en', 'nl_en', 'pt_en', 'ru_en', 'sl_en', 'sv-SE_en', 'ta_en', 'tr_en', 'zh-CN_en'] 36\n"
     ]
    }
   ],
   "source": [
    "from datasets import get_dataset_config_names,load_dataset\n",
    "from itertools import islice\n",
    "import pyarrow as pa\n",
    "import pyarrow.parquet as pq\n",
    "\n",
    "configs = get_dataset_config_names(\"fixie-ai/covost2\")\n",
    "print(\"Available Configurations:\")\n",
    "print(configs,len(configs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8b0610d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pyarrow as pa\n",
    "import pyarrow.parquet as pq\n",
    "from tqdm import tqdm\n",
    "from itertools import islice\n",
    "import soundfile as sf\n",
    "import pandas as pd\n",
    "from concurrent.futures import ThreadPoolExecutor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "13db757e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|                                                                                                                    | 0/36 [00:00<?, ?it/s]/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.4\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n",
      "  3%|███                                                                                                         | 1/36 [00:36<21:29, 36.85s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 2283 samples to covost2/covost2_ar_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  6%|█████▊                                                                                                   | 2/36 [05:23<1:44:16, 184.02s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_ca_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  8%|████████▊                                                                                                | 3/36 [05:48<1:01:17, 111.45s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 1241 samples to covost2/covost2_cy_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 11%|███████████▋                                                                                             | 4/36 [10:27<1:34:41, 177.56s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_de_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 14%|██████████████▌                                                                                          | 5/36 [15:29<1:54:48, 222.23s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_ar.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 17%|█████████████████▌                                                                                       | 6/36 [20:30<2:04:33, 249.13s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_ca.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 19%|████████████████████▍                                                                                    | 7/36 [25:34<2:08:59, 266.87s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_cy.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 22%|███████████████████████▎                                                                                 | 8/36 [30:41<2:10:31, 279.70s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_de.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 25%|██████████████████████████▎                                                                              | 9/36 [35:57<2:11:00, 291.11s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_et.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 28%|████████████████████████████▉                                                                           | 10/36 [41:08<2:08:45, 297.14s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_fa.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 31%|███████████████████████████████▊                                                                        | 11/36 [46:22<2:05:59, 302.38s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_id.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 33%|██████████████████████████████████▋                                                                     | 12/36 [51:25<2:01:04, 302.70s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_ja.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 36%|█████████████████████████████████████▌                                                                  | 13/36 [57:06<2:00:27, 314.23s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_lv.parquet\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7856c2bf123848988e4086943f825e9f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/26.8k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 39%|███████████████████████████████████████▋                                                              | 14/36 [1:02:16<1:54:42, 312.85s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_mn.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 42%|██████████████████████████████████████████▌                                                           | 15/36 [1:07:15<1:48:01, 308.63s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_sl.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 44%|█████████████████████████████████████████████▎                                                        | 16/36 [1:12:14<1:41:56, 305.84s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_sv-SE.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 47%|████████████████████████████████████████████████▏                                                     | 17/36 [1:17:17<1:36:33, 304.91s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_ta.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 50%|███████████████████████████████████████████████████                                                   | 18/36 [1:22:21<1:31:27, 304.86s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_tr.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 53%|█████████████████████████████████████████████████████▊                                                | 19/36 [1:27:23<1:26:07, 303.96s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_en_zh-CN.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 56%|████████████████████████████████████████████████████████▋                                             | 20/36 [1:30:23<1:11:08, 266.77s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error processing es_en: Error opening <_io.BytesIO object at 0x7f38c4a649a0>: Format not recognised.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 58%|████████████████████████████████████████████████████████████▋                                           | 21/36 [1:30:51<48:46, 195.11s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 1782 samples to covost2/covost2_et_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 61%|███████████████████████████████████████████████████████████████▌                                        | 22/36 [1:34:25<46:51, 200.80s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_fa_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 64%|██████████████████████████████████████████████████████████████████▍                                     | 23/36 [1:38:56<48:01, 221.66s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 51000 samples to covost2/covost2_fr_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 67%|█████████████████████████████████████████████████████████████████████▎                                  | 24/36 [1:39:16<32:14, 161.17s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 1243 samples to covost2/covost2_id_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 69%|████████████████████████████████████████████████████████████████████████▏                               | 25/36 [1:42:20<30:48, 168.02s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 31698 samples to covost2/covost2_it_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 72%|███████████████████████████████████████████████████████████████████████████                             | 26/36 [1:42:41<20:39, 123.95s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 1119 samples to covost2/covost2_ja_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 75%|██████████████████████████████████████████████████████████████████████████████▊                          | 27/36 [1:43:08<14:14, 94.93s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 2337 samples to covost2/covost2_lv_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 78%|█████████████████████████████████████████████████████████████████████████████████▋                       | 28/36 [1:43:34<09:52, 74.06s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 2067 samples to covost2/covost2_mn_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 81%|████████████████████████████████████████████████████████████████████████████████████▌                    | 29/36 [1:44:20<07:39, 65.66s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 7108 samples to covost2/covost2_nl_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 30/36 [1:45:15<06:15, 62.53s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 9158 samples to covost2/covost2_pt_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 86%|██████████████████████████████████████████████████████████████████████████████████████████▍              | 31/36 [1:46:42<05:48, 69.80s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 12112 samples to covost2/covost2_ru_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 89%|█████████████████████████████████████████████████████████████████████████████████████████████▎           | 32/36 [1:47:06<03:44, 56.06s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 1843 samples to covost2/covost2_sl_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 92%|████████████████████████████████████████████████████████████████████████████████████████████████▎        | 33/36 [1:47:28<02:17, 45.93s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 2160 samples to covost2/covost2_sv-SE_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 94%|███████████████████████████████████████████████████████████████████████████████████████████████████▏     | 34/36 [1:47:48<01:16, 38.13s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 1358 samples to covost2/covost2_ta_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████   | 35/36 [1:48:19<00:35, 35.98s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 3966 samples to covost2/covost2_tr_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [1:49:26<00:00, 182.40s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved 7085 samples to covost2/covost2_zh-CN_en.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "parquet_dir = 'covost2'\n",
    "mp3_dir = 'covost-mp3'\n",
    "\n",
    "os.makedirs(parquet_dir, exist_ok=True)\n",
    "os.makedirs(mp3_dir, exist_ok=True)\n",
    "\n",
    "def save_audio_file(example):\n",
    "    try:\n",
    "        audio_info = example.get('audio')\n",
    "        if audio_info:\n",
    "            audio_path = os.path.join(mp3_dir, audio_info['path'])\n",
    "            sf.write(audio_path, audio_info['array'], audio_info['sampling_rate'], format='MP3')\n",
    "        return True\n",
    "    except Exception as e:\n",
    "        print(f\"Error saving audio: {e}\")\n",
    "        return False\n",
    "\n",
    "def process_split(split):\n",
    "    try:\n",
    "        streamed_ds = load_dataset(\"fixie-ai/covost2\", split, streaming=True)['train']\n",
    "        \n",
    "        batch = []\n",
    "        count = 0\n",
    "        with ThreadPoolExecutor() as executor:\n",
    "            for example in islice(streamed_ds, 51_000):\n",
    "                filtered_example = {\n",
    "                    'file': example.get('audio')['path'],\n",
    "                    'sentence': example.get('sentence'),\n",
    "                    'translation': example.get('translation'),\n",
    "                }\n",
    "                batch.append(filtered_example)\n",
    "                executor.submit(save_audio_file, example)\n",
    "                count += 1\n",
    "\n",
    "        if not batch:\n",
    "            print(f\"No valid data found in {split}.\")\n",
    "            return\n",
    "\n",
    "        table = pa.Table.from_pandas(pd.DataFrame(batch))\n",
    "        save_path = os.path.join(parquet_dir, f\"covost2_{split}.parquet\")\n",
    "        pq.write_table(table, save_path)\n",
    "        print(f\"Saved {len(batch)} samples to {save_path}\")\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error processing {split}: {e}\")\n",
    "\n",
    "for split in tqdm(configs):\n",
    "    process_split(split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "99a99780",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['covost2/covost2_ar_en.parquet',\n",
       " 'covost2/covost2_ca_en.parquet',\n",
       " 'covost2/covost2_cy_en.parquet',\n",
       " 'covost2/covost2_de_en.parquet',\n",
       " 'covost2/covost2_en_ar.parquet',\n",
       " 'covost2/covost2_en_ca.parquet',\n",
       " 'covost2/covost2_en_cy.parquet',\n",
       " 'covost2/covost2_en_de.parquet',\n",
       " 'covost2/covost2_en_et.parquet',\n",
       " 'covost2/covost2_en_fa.parquet',\n",
       " 'covost2/covost2_en_id.parquet',\n",
       " 'covost2/covost2_en_ja.parquet',\n",
       " 'covost2/covost2_en_lv.parquet',\n",
       " 'covost2/covost2_en_mn.parquet',\n",
       " 'covost2/covost2_en_sl.parquet',\n",
       " 'covost2/covost2_en_sv-SE.parquet',\n",
       " 'covost2/covost2_en_ta.parquet',\n",
       " 'covost2/covost2_en_tr.parquet',\n",
       " 'covost2/covost2_en_zh-CN.parquet',\n",
       " 'covost2/covost2_et_en.parquet',\n",
       " 'covost2/covost2_fa_en.parquet',\n",
       " 'covost2/covost2_fr_en.parquet',\n",
       " 'covost2/covost2_id_en.parquet',\n",
       " 'covost2/covost2_it_en.parquet',\n",
       " 'covost2/covost2_ja_en.parquet',\n",
       " 'covost2/covost2_lv_en.parquet',\n",
       " 'covost2/covost2_mn_en.parquet',\n",
       " 'covost2/covost2_nl_en.parquet',\n",
       " 'covost2/covost2_pt_en.parquet',\n",
       " 'covost2/covost2_ru_en.parquet',\n",
       " 'covost2/covost2_sl_en.parquet',\n",
       " 'covost2/covost2_sv-SE_en.parquet',\n",
       " 'covost2/covost2_ta_en.parquet',\n",
       " 'covost2/covost2_tr_en.parquet',\n",
       " 'covost2/covost2_zh-CN_en.parquet']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "files = sorted(glob('covost2/*.parquet'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f246de36",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.4\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import random\n",
    "from sklearn.model_selection import train_test_split\n",
    "from langcodes import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b5ba288a",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = [\n",
    "    'please transcribe to {lang}',\n",
    "    'transcribe the audio in {lang}',\n",
    "    'translate the narrative to {lang}',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "14d82760",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_parquet(files[0]).to_dict(orient = 'records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "74ba6ecd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:07<00:00,  4.38it/s]\n"
     ]
    }
   ],
   "source": [
    "trains, tests = [], []\n",
    "\n",
    "for f in tqdm(files):\n",
    "    to_lang = f.split('_')[-1].split('.')[0]\n",
    "    from_lang = f.split('_')[1]\n",
    "    from_lang = Language.get(from_lang).display_name()\n",
    "    to_lang = Language.get(to_lang).display_name()\n",
    "    df = pd.read_parquet(f).to_dict(orient = 'records')\n",
    "    train, test = train_test_split(df, test_size = 100)\n",
    "    for t in train:\n",
    "        audio_filename = os.path.join('covost-mp3', t['file'])\n",
    "        if not os.path.exists(audio_filename):\n",
    "            continue\n",
    "        trains.append({\n",
    "            'question': random.choice(question).format(lang = to_lang),\n",
    "            'from_language': from_lang,\n",
    "            'to_language': to_lang,\n",
    "            'audio_filename': audio_filename,\n",
    "            'answer': t['translation']\n",
    "        })\n",
    "    for t in test:\n",
    "        audio_filename = os.path.join('covost-mp3', t['file'])\n",
    "        if not os.path.exists(audio_filename):\n",
    "            continue\n",
    "        tests.append({\n",
    "            'question': random.choice(question).format(lang = to_lang),\n",
    "            'from_language': from_lang,\n",
    "            'to_language': to_lang,\n",
    "            'audio_filename': audio_filename,\n",
    "            'answer': t['translation']\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "185f7605",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1054060, 3500)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(trains), len(tests)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "fcbaa253",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "dataset = Dataset.from_list(trains)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c0c0dcfa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6aabf16a864c4e0a86804730d69e2b23",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "47882725d0084a3391a35f2a7cf0373f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/1055 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bd35c517b9f7430f973083179cc3cfe1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/64.1M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction/commit/83aa36186609bc30c4e1e4303e3069e1ec1e54c4', commit_message='Upload dataset', commit_description='', oid='83aa36186609bc30c4e1e4303e3069e1ec1e54c4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CoVoST2-Instruction'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub('mesolitica/CoVoST2-Instruction', split = 'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d950c5f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = Dataset.from_list(tests)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "bf15f442",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "92aaef0f3b5140e9b0f009639004c0f9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd7c3b5d698d44c18a3ec1a74f601997",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e234cda078d446bbb41253a2f9d52059",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/208k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7720cb08a5324f77aac769b88627e5f2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/446 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction/commit/4a16a036634940551fa2b89aafbc7d8aed37c7d6', commit_message='Upload dataset', commit_description='', oid='4a16a036634940551fa2b89aafbc7d8aed37c7d6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CoVoST2-Instruction'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub('mesolitica/CoVoST2-Instruction', split = 'test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "682df6e3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9.8G\tcovost-mp3\r\n"
     ]
    }
   ],
   "source": [
    "!du -hs covost-mp3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "96e4066c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "import os\n",
    "\n",
    "def zip_folder(folder_path, output_zip_path):\n",
    "    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    "        for root, dirs, files in os.walk(folder_path):\n",
    "            for file in files:\n",
    "                file_path = os.path.join(root, file)\n",
    "                arcname = os.path.relpath(file_path, start=folder_path)\n",
    "                zipf.write(file_path, arcname)\n",
    "\n",
    "zip_folder('covost-mp3', 'covost-mp3.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "fc962c73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b0aad39216e4824b3f2436a44cb8628",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/9.48G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction/commit/6f1bf159d655b28a25f82f0f7ef1dde4e4160a43', commit_message='Upload covost-mp3.zip with huggingface_hub', commit_description='', oid='6f1bf159d655b28a25f82f0f7ef1dde4e4160a43', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CoVoST2-Instruction'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from huggingface_hub import HfApi\n",
    "api = HfApi()\n",
    "\n",
    "api.upload_file(\n",
    "    path_or_fileobj=\"covost-mp3.zip\",\n",
    "    path_in_repo=\"covost-mp3.zip\",\n",
    "    repo_id=\"mesolitica/CoVoST2-Instruction\",\n",
    "    repo_type=\"dataset\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "894dbf17",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
